library(vegan)
library(permute)
library(simba)
library(cluster)
library(ecodist)
library(fields)

##############################
#PCoA of GO
gofac<-read.csv("gofactor.csv",header=T,row.names=1)
gofac.gower<-daisy(gofac,metric="gower")
gofac.gower
gofac.pcoa<-cmdscale(gofac.gower,k=1000,eig=TRUE,add=TRUE)
gofac.pcoa
#want to know the % variation explained by any principal component? then divide each eigenvalue by the sum of the eigenvalues across all PCs. Below looking at first 5 PCs
gofac.pcoa$eig[1:5]/sum(gofac.pcoa$eig)*100
#look at ordination plot of PC and PC2
ordiplot(gofac.pcoa, choices=c(1,2), type="text", display="sites", xlab="PC1", ylab="PC2")
#here performs a linear regression between each of the original desriptors (species)and the scores from each PC. A permutation test is used to assess statistical significance
vec.gofac<-envfit(gofac.pcoa$points, gofac, perm=1000)
#look at values
vec.gofac
#same plot just w/0 labels
ordiplot(gofac.pcoa, choices=c(1,2), xlab='PC1', ylab="PC2")
plot(vec.gofac, p.max=0.01, col="blue")

###################################################
#PCoA of meth descrip
meth<-read.csv("meth1000.csv",header=T,row.names=1)
#logtransform
meth.log<-log(meth+1)
hist.plots(meth.log)
meth.euc<-daisy(meth.log,metric="euclidean")
meth.pcoa<-cmdscale(meth.euc,k=1000,eig=TRUE,add=TRUE)
meth.pcoa
#want to know the % variation explained by any principal component? then divide each eigenvalue by the sum of the eigenvalues across all PCs. Below looking at first 5 PCs
meth.pcoa$eig[1:5]/sum(meth.pcoa$eig)*100
#look at ordination plot of PC and PC2
ordiplot(meth.pcoa, choices=c(1,2), type="text", display="sites", xlab="PC1 (59.3%)", ylab="PC2 (29.6%)")
#here performs a linear regression between each of the original desriptors (species)and the scores from each PC. A permutation test is used to assess statistical significance
vec.meth<-envfit(meth.pcoa$points, meth.log, perm=1000)
#look at values
vec.meth
#same plot just w/0 labels
ordiplot(meth.pcoa, choices=c(1,2), xlab='PC1 (59.3%)', ylab="PC2 (29.6%)")
plot(vec.meth, p.max=0.01, col="blue")

#####want to do this again but taking out # of CGs
#logtransform
meth2.log<-log(meth[,4:9]+1)
head(meth2.log)
meth.euc2<-daisy(meth2.log,metric="euclidean")
meth2.pcoa<-cmdscale(meth.euc2,k=1000,eig=TRUE,add=TRUE)
meth2.pcoa
#want to know the % variation explained by any principal component? then divide each eigenvalue by the sum of the eigenvalues across all PCs. Below looking at first 5 PCs
meth2.pcoa$eig[1:5]/sum(meth2.pcoa$eig)*100
#look at ordination plot of PC and PC2
ordiplot(meth2.pcoa, choices=c(1,2), type="text", display="sites", xlab="PC1 (79.3%)", ylab="PC2 (18.8%)")
#here performs a linear regression between each of the original desriptors (species)and the scores from each PC. A permutation test is used to assess statistical significance
vec.meth2<-envfit(meth2.pcoa$points, meth2.log, perm=1000)
#look at values
vec.meth2
#same plot just w/0 labels
ordiplot(meth2.pcoa, choices=c(1,2), xlab='PC1 (79.3%)', ylab="PC2 (18.8%)")
plot(vec.meth2, p.max=0.01, col="blue")

#Try a CAP to compare meth and GO??
go.cap<-capscale(gofac.gower~., meth2.log)
summary(go.cap)
#interpretation: inertia corresponds to variance.  Here 1.3% of the variance is explained by the methylation matrix
#can I do it the other way?
go.cap<-capscale(meth.euc2~., gofac)
summary(go.cap)
#interpretation, this only accounted for 2%, 2% of variance in meth could be explained by the GO

######Maybe add in tissue dist
tis<-read.csv('tisdist1000.csv',header=T,row.names=1)
tis.log<-log(tis+1)
hist.plots(tis.log)
tis.bray<-vegdist(tis.log,method='bray')
tis.pcoa<-cmdscale(tis.bray,k=1000,eig=TRUE,add=TRUE)
tis.pcoa
#want to know the % variation explained by any principal component? then divide each eigenvalue by the sum of the eigenvalues across all PCs. Below looking at first 5 PCs
tis.pcoa$eig[1:5]/sum(tis.pcoa$eig)*100
#look at ordination plot of PC and PC2
ordiplot(meth2.pcoa, choices=c(1,2), type="text", display="sites", xlab="PC1 (16.5%)", ylab="PC2 (7.3%)")
#here performs a linear regression between each of the original desriptors (species)and the scores from each PC. A permutation test is used to assess statistical significance
vec.tis<-envfit(tis.pcoa$points, tis.log, perm=1000)
#look at values
vec.tis
#same plot just w/0 labels
ordiplot(tis.pcoa, choices=c(1,2), xlab='PC1 (16.5%)', ylab="PC2 (7.3%)")
plot(vec.tis, p.max=0.01, col="blue")

##transpose it
tis.log<-log(tis+1)
tis.logt<-t(tis.log)
tist.bray<-vegdist(tis.logt,method='bray')
tist.pcoa<-cmdscale(tist.bray,k=5,eig=TRUE,add=TRUE)
tist.pcoa
#want to know the % variation explained by any principal component? then divide each eigenvalue by the sum of the eigenvalues across all PCs. Below looking at first 5 PCs
tist.pcoa$eig[1:5]/sum(tist.pcoa$eig)*100
#look at ordination plot of PC and PC2
ordiplot(tist.pcoa, choices=c(1,2), type="text", display="sites", xlab="PC1 (33.8%)", ylab="PC2 (24.9%)")
#here performs a linear regression between each of the original desriptors (species)and the scores from each PC. A permutation test is used to assess statistical significance
vec.tist<-envfit(tist.pcoa$points, tis.logt, perm=1000)
#look at values
vec.tist
#same plot just w/0 labels
ordiplot(tist.pcoa, choices=c(1,2), xlab='PC1 (16.5%)', ylab="PC2 (7.3%)")
plot(vec.tist, p.max=0.005, col="blue")

#Try a CAP to compare meth and tis??
methfortis<-read.csv('meth1000fortis.csv',header=T,row.names=1)
methfortis.log<-log(methfortis+1)
methfortis.euc<-daisy(methfortis.log,metric="euclidean")
methfortis.cap<-capscale(tis.bray~., methfortis.log)
summary(methfortis.cap)
#interpretation: inertia corresponds to variance.  Here 32.3% of the variance in the tissue distribution is explained by the methylation matrix, CAP1 explains 31.4%, while 1 and 2 explain 32% of the variance
##TO TEST SIGNIFICANCE (Monte Carlo global permutation tests) - several are available
#first look at all constraints simultaneously (considering all environmental constraints at once)
anova(methfortis.cap)
#second we may want to look at each axis in turn - the test based on the first axis has max power against the alternative hypothesis that there is a SINGLE dominating gradient that determines the relation between species and environment.  THis is a sequential test, so the second test just partiions out the first axis and tests the residual variance, then the third test partitions out the first and second etc.
anova(methfortis.cap, by='axis')
#finally we may want to look at each TERM in the model, performs a second signficance test for each term (constraining variable)
anova(methfortis.cap,by='terms')
#ORDINATION SCORES: there are 4 types of scores to look at (we discussed in class that WA scores are the most useful)
#lets use WA scores to visualize the ordination
plot(methfortis.cap,choices=c(1,2), type='points', display='wa', scaling=2)
#or you can just plot site labels:
plot(methfortis.cap,choices=c(1,2), type='n', display='wa', scaling=2)
text(methfortis.cap,choices=c(1,2),labels=row.names(methfortis))
#interpreting the constrained ordination
#canonical coefficients
#these are eigenvector coefficients, "they are not very useful for interpretation"
#intra-set scores
round(intrasetcor(methfortis.cap),5)
#inter-set scores (correlations between species-derived sample scores (WA scores)and environmental variables)
round(intersetcor(methfortis.cap), 5)
#biplot for explanatory variables
#biplot scores for the constraints are given in the summary() of the ordination
summary(methfortis.cap)
#triplot
plot(methfortis.cap,choices=c(1,2),display=c('wa','sp','bp'),scaling=2)

##partial cca?
#######nothing too exciting here
tis<-read.csv('tisdist1000.csv',header=T,row.names=1)
tis.log<-log(tis+1)
tis.bray<-vegdist(tis.log,method='bray')
#separating methmatrix
attach(methfortis.log)
meth<-cbind(meth.exon,meth.intron,meth.total)
desc<-cbind(num.exon,genelength,meanexonlength)
detach(methfortis.log)
#
tis.pcca1<-cca(tis.log,meth,desc)
summary(tis.pcca1)
tis.pcca1<-cca(tis.log,desc,meth)
summary(tis.pcca1)
tis.pcca<-ordi.part(tis.log,meth,desc,method='cca')
plot.ordi.part(tis.pcca,which='total')

#
gofac<-read.csv("gofactor1000fortisbinary.csv",header=T,row.names=1)
head(gofac)
go.pcca1<-ordi.part(gofac,meth,desc,method='cca')
meth.pcca1<-ordi.part(methfortis,desc,gofac,method='cca')

#PCOA of everything
all<-read.csv("alldataforsinglego.csv",header=T,row.names=1)
all.gower<-daisy(all,metric="gower")
all.gower
all.pcoa<-cmdscale(all.gower,k=808,eig=TRUE,add=TRUE)
all.pcoa
#want to know the % variation explained by any principal component? then divide each eigenvalue by the sum of the eigenvalues across all PCs. Below looking at first 5 PCs
all.pcoa$eig[1:5]/sum(all.pcoa$eig)*100
#look at ordination plot of PC and PC2
ordiplot(all.pcoa, choices=c(1,2), type="text", display="sites", xlab="PC1(6.5%)", ylab="PC2(3.6%)")
#here performs a linear regression between each of the original desriptors (species)and the scores from each PC. A permutation test is used to assess statistical significance
vec.all<-envfit(all.pcoa$points, all, perm=1000)
#look at values
vec.all
#same plot just w/0 labels
ordiplot(all.pcoa, choices=c(1,2), xlab='PC1', ylab="PC2")
plot(vec.all, p.max=0.01, col="blue")
points(col=('rainbow')


#I need to try this again w/0 GO because it looks weird
all<-read.csv("alldataforsinglego.csv",header=T,row.names=1)
head(all)
all2<-all[,2:7]
head(all2)
all.gower<-daisy(all2,metric="gower")
all.gower
all.pcoa<-cmdscale(all.gower,k=808,eig=TRUE,add=TRUE)
all.pcoa
#want to know the % variation explained by any principal component? then divide each eigenvalue by the sum of the eigenvalues across all PCs. Below looking at first 5 PCs
all.pcoa$eig[1:5]/sum(all.pcoa$eig)*100
#look at ordination plot of PC and PC2
ordiplot(all.pcoa, choices=c(1,2), type="text", display="sites", xlab="PC1(6.5%)", ylab="PC2(3.6%)")
#here performs a linear regression between each of the original desriptors (species)and the scores from each PC. A permutation test is used to assess statistical significance
vec.all<-envfit(all.pcoa$points, all2, perm=1000)
#look at values
vec.all
#same plot just w/0 labels
ordiplot(all.pcoa, choices=c(1,2), xlab='PC1', ylab="PC2")
plot(vec.all, p.max=0.01, col="blue")

#I need to try this again w/ GO and w/o meth 
all<-read.csv("alldataforsinglego_methlevel.csv",header=T,row.names=1)
head(all)
all2<-all[,2:5]
head(all2)
all.gower<-daisy(all2,metric="gower")
all.gower
all.pcoa<-cmdscale(all.gower,k=808,eig=TRUE,add=TRUE)
all.pcoa
#want to know the % variation explained by any principal component? then divide each eigenvalue by the sum of the eigenvalues across all PCs. Below looking at first 5 PCs
all.pcoa$eig[1:5]/sum(all.pcoa$eig)*100
#look at ordination plot of PC and PC2
ordiplot(all.pcoa, choices=c(1,2), type="text", display="sites", xlab="PC1(6.5%)", ylab="PC2(3.6%)")
#here performs a linear regression between each of the original desriptors (species)and the scores from each PC. A permutation test is used to assess statistical significance
vec.all<-envfit(all.pcoa$points, all2, perm=1000)
#look at values
vec.all
#same plot just w/0 labels
ordiplot(all.pcoa, choices=c(1,2), xlab='PC1', ylab="PC2")
plot(vec.all, p.max=0.05, col="blue")